1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.solr.handler.component;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Collection;
24  import java.util.Collections;
25  import java.util.EnumSet;
26  import java.util.IdentityHashMap;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Set;
31  
32  import org.apache.commons.lang.StringUtils;
33  import org.apache.lucene.document.FieldType.NumericType;
34  import org.apache.lucene.index.LeafReaderContext;
35  import org.apache.lucene.queries.function.FunctionQuery;
36  import org.apache.lucene.queries.function.ValueSource;
37  import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
38  import org.apache.lucene.queries.function.valuesource.QueryValueSource;
39  import org.apache.lucene.search.Query;
40  import org.apache.solr.common.SolrException;
41  import org.apache.solr.common.SolrException.ErrorCode;
42  import org.apache.solr.common.params.CommonParams;
43  import org.apache.solr.common.params.ModifiableSolrParams;
44  import org.apache.solr.common.params.SolrParams;
45  import org.apache.solr.common.params.StatsParams;
46  import org.apache.solr.common.util.StrUtils;
47  import org.apache.solr.request.DocValuesStats;
48  import org.apache.solr.request.SolrQueryRequest;
49  import org.apache.solr.schema.IndexSchema;
50  import org.apache.solr.schema.SchemaField;
51  import org.apache.solr.search.DocIterator;
52  import org.apache.solr.search.DocSet;
53  import org.apache.solr.search.QParser;
54  import org.apache.solr.search.QParserPlugin;
55  import org.apache.solr.search.QueryParsing;
56  import org.apache.solr.search.SolrIndexSearcher;
57  import org.apache.solr.search.SyntaxError;
58  import org.apache.solr.util.hll.HLL;
59  import org.apache.solr.util.hll.HLLType;
60  
61  import com.google.common.hash.Hashing;
62  import com.google.common.hash.HashFunction;
63  
64  /**
65   * Models all of the information associated with a single {@link StatsParams#STATS_FIELD}
66   * instance.
67   *
68   * @see StatsComponent
69   */
70  public class StatsField {
71    
72    /**
73     * An enumeration representing the sumer set of all possible stat values that can be computed.
74     * Each of these enum values can be specified as a local param in a <code>stats.field</code> 
75     * (eg: <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values 
76     * are valid for all field types (eg: <code>mean</code> is meaningless for String fields)
77     *
78     * @lucene.internal
79     * @lucene.experimental
80     */
81    public static enum Stat {
82      min(true),
83      max(true),
84      missing(true),
85      sum(true),
86      count(true),
87      mean(false, sum, count),
88      sumOfSquares(true),
89      stddev(false, sum, count, sumOfSquares),
90      distinctValues(true),
91      countDistinct(false, distinctValues),
92      percentiles(true){
93        /** special for percentiles **/
94        boolean parseParams(StatsField sf) {
95          String percentileParas = sf.localParams.get(this.name());
96          if (percentileParas != null) {
97            List<Double> percentiles = new ArrayList<Double>();
98            try {
99              for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
100               percentiles.add(Double.parseDouble(percentile));
101             }
102             if (!percentiles.isEmpty()) {
103               sf.percentilesList.addAll(percentiles);
104               sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression", 
105                                                                sf.tdigestCompression);
106               return true;
107             }
108           } catch (NumberFormatException e) {
109             throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
110                 + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
111                 + e.getMessage(), e);
112           }
113 
114         }
115         return false;
116       }
117     },
118     cardinality(true) { 
119       /** special for percentiles **/
120       boolean parseParams(StatsField sf) {
121         try {
122           sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
123           return (null != sf.hllOpts);
124         } catch (Exception e) {
125           throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
126               + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
127               + e.getMessage(), e);
128         }
129       }
130     };
131 
132     private final List<Stat> distribDeps;
133     
134     /**
135      * Sole constructor for Stat enum values
136      * @param deps the set of stat values, other then this one, which are a distributed 
137      *        dependency and must be computed and returned by each individual shards in 
138      *        order to compute <i>this</i> stat over the entire distributed result set.
139      * @param selfDep indicates that when computing this stat across a distributed result 
140      *        set, each shard must compute this stat <i>in addition to</i> any other 
141      *        distributed dependences.
142      * @see #getDistribDeps
143      */
144     Stat(boolean selfDep, Stat... deps) {
145       distribDeps = new ArrayList<Stat>(deps.length+1);
146       distribDeps.addAll(Arrays.asList(deps));
147       if (selfDep) { 
148         distribDeps.add(this);
149       }
150     }
151     
152     /**
153      * Given a String, returns the corrisponding Stat enum value if any, otherwise returns null.
154      */
155     public static Stat forName(String paramKey) {
156       try {
157         return Stat.valueOf(paramKey);
158       } catch (IllegalArgumentException e) {
159         return null;
160       }
161     }
162     
163     /**
164      * The stats that must be computed and returned by each shard involved in a distributed 
165      * request in order to compute the overall value for this stat across the entire distributed 
166      * result set.  A Stat instance may include itself in the <code>getDistribDeps()</code> result,
167      * but that is not always the case.
168      */
169     public EnumSet<Stat> getDistribDeps() {
170       return EnumSet.copyOf(this.distribDeps);
171     }
172     
173     /** 
174      * Called when the name of a stat is found as a local param on this {@link StatsField}
175      * @return true if the user is requesting this stat, else false
176      */
177     boolean parseParams(StatsField sf) {
178       return sf.localParams.getBool(this.name(), false);
179     }
180     
181   }
182 
183   /**
184    * the equivilent stats if "calcdistinct" is specified
185    * @see Stat#countDistinct
186    * @see Stat#distinctValues
187    */
188   private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct, Stat.distinctValues);
189 
190   /**
191    * The set of stats computed by default when no localparams are used to specify explicit stats 
192    */
193   public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet
194     (EnumSet.of(Stat.min, Stat.max, Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));
195 
196   private final SolrIndexSearcher searcher;
197   private final ResponseBuilder rb;
198   private final String originalParam; // for error messages
199   private final SolrParams localParams;
200   private final ValueSource valueSource; // may be null if simple field stats
201   private final SchemaField schemaField; // may be null if function/query stats
202   private final String key;
203   private final boolean  topLevelCalcDistinct;
204   private final String[] facets;
205   private final List<String> tagList;
206   private final List<String> excludeTagList;
207   private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
208   private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
209   private final List<Double> percentilesList= new ArrayList<Double>();
210   private final boolean isShard;
211   
212   private double tdigestCompression = 100.0D;
213   private HllOptions hllOpts;
214   
215   /**
216    * @param rb the current request/response
217    * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
218    */
219   public StatsField(ResponseBuilder rb, String statsParam) { 
220     this.rb = rb;
221     this.searcher = rb.req.getSearcher();
222     this.originalParam = statsParam;
223 
224     SolrParams params = rb.req.getParams();
225     try {
226       isShard = params.getBool("isShard", false);
227       SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
228       if (null == localParams) {
229         // simplest possible input: bare string (field name)
230         ModifiableSolrParams customParams = new ModifiableSolrParams();
231         customParams.add(QueryParsing.V, originalParam);
232         localParams = customParams;
233       }
234 
235       this.localParams = localParams;
236       
237       String parserName = localParams.get(QueryParsing.TYPE);
238       SchemaField sf = null;
239       ValueSource vs = null;
240 
241       if ( StringUtils.isBlank(parserName) ) {
242 
243         // basic request for field stats
244         sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));
245 
246       } else {
247         // we have a non trivial request to compute stats over a query (or function)
248 
249         // NOTE we could use QParser.getParser(...) here, but that would redundently
250         // reparse everything.  ( TODO: refactor a common method in QParser ?)
251         QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
252         QParser qp =  qplug.createParser(localParams.get(QueryParsing.V), 
253                                          localParams, params, rb.req);
254 
255         // figure out what type of query we are dealing, get the most direct ValueSource
256         vs = extractValueSource(qp.parse());
257 
258         // if this ValueSource directly corrisponds to a SchemaField, act as if
259         // we were asked to compute stats on it directly
260         // ie:  "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
261         sf = extractSchemaField(vs, searcher.getSchema());
262         if (null != sf) {
263           vs = null;
264         }
265       }
266       
267       assert ( (null == vs) ^ (null == sf) ) : "exactly one of vs & sf must be null";
268       
269       this.schemaField = sf;
270       this.valueSource = vs;
271 
272     } catch (SyntaxError e) {
273       throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " + 
274                               StatsParams.STATS_FIELD + ": " + originalParam + " due to: "
275                               + e.getMessage(), e);
276     }
277 
278     // allow explicit setting of the response key via localparams...
279     this.key = localParams.get(CommonParams.OUTPUT_KEY, 
280                                // default to the main param value...
281                                localParams.get(CommonParams.VALUE, 
282                                                // default to entire original param str.
283                                                originalParam));
284 
285     this.topLevelCalcDistinct = null == schemaField
286         ? params.getBool(StatsParams.STATS_CALC_DISTINCT, false) 
287         : params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);
288 
289     populateStatsSets();
290         
291     String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
292     this.facets = (null == facets) ? new String[0] : facets;
293     String tagStr = localParams.get(CommonParams.TAG);
294     this.tagList = (null == tagStr)
295         ? Collections.<String>emptyList()
296         : StrUtils.splitSmart(tagStr,',');
297 
298     // figure out if we need a special base DocSet
299     String excludeStr = localParams.get(CommonParams.EXCLUDE);
300     this.excludeTagList = (null == excludeStr) 
301       ? Collections.<String>emptyList()
302       : StrUtils.splitSmart(excludeStr,',');
303 
304     assert ( (null == this.valueSource) ^ (null == this.schemaField) ) 
305       : "exactly one of valueSource & schemaField must be null";
306   }
307 
308   /**
309    * Inspects a {@link Query} to see if it directly maps to a {@link ValueSource},
310    * and if so returns it -- otherwise wraps it as needed.
311    *
312    * @param q Query whose scores we have been asked to compute stats of
313    * @returns a ValueSource to use for computing the stats
314    */
315   private static ValueSource extractValueSource(Query q) {
316     return (q instanceof FunctionQuery) ?
317       // Common case: we're wrapping a func, so we can directly pull out ValueSource
318       ((FunctionQuery) q).getValueSource() :
319       // asked to compute stats over a query, wrap it up as a ValueSource
320       new QueryValueSource(q, 0.0F);
321   }
322 
323   /**
324    * Inspects a {@link ValueSource} to see if it directly maps to a {@link SchemaField}, 
325    * and if so returns it.
326    *
327    * @param vs ValueSource we've been asked to compute stats of
328    * @param schema The Schema to use
329    * @returns Corrisponding {@link SchemaField} or null if the ValueSource is more complex
330    * @see FieldCacheSource
331    */
332   private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
333     if (vs instanceof FieldCacheSource) {
334       String fieldName = ((FieldCacheSource)vs).getField();
335       return schema.getField(fieldName);
336     }
337     return null;
338   }
339 
340   /** 
341    * The key to be used when refering to this {@link StatsField} instance in the 
342    * response tp clients.
343    */
344   public String getOutputKey() {
345     return key;
346   }
347 
348   /**
349    * Computes a base {@link DocSet} for the current request to be used
350    * when computing global stats for the local index.
351    *
352    * This is typically the same as the main DocSet for the {@link ResponseBuilder}
353    * unless {@link CommonParams#TAG tag}ged filter queries have been excluded using 
354    * the {@link CommonParams#EXCLUDE ex} local param
355    */
356   public DocSet computeBaseDocSet() throws IOException {
357 
358     DocSet docs = rb.getResults().docSet;
359     Map<?,?> tagMap = (Map<?,?>) rb.req.getContext().get("tags");
360 
361     if (excludeTagList.isEmpty() || null == tagMap) {
362       // either the exclude list is empty, or there
363       // aren't any tagged filters to exclude anyway.
364       return docs;
365     }
366 
367     IdentityHashMap<Query,Boolean> excludeSet = new IdentityHashMap<Query,Boolean>();
368     for (String excludeTag : excludeTagList) {
369       Object olst = tagMap.get(excludeTag);
370       // tagMap has entries of List<String,List<QParser>>, but subject to change in the future
371       if (!(olst instanceof Collection)) continue;
372       for (Object o : (Collection<?>)olst) {
373         if (!(o instanceof QParser)) continue;
374         QParser qp = (QParser)o;
375         try {
376           excludeSet.put(qp.getQuery(), Boolean.TRUE);
377         } catch (SyntaxError e) {
378           // this shouldn't be possible since the request should have already
379           // failed when attempting to execute the query, but just in case...
380           throw new SolrException(ErrorCode.BAD_REQUEST, "Excluded query can't be parsed: " + 
381                                   originalParam + " due to: " + e.getMessage(), e);
382         }
383       }
384     }
385     if (excludeSet.size() == 0) return docs;
386     
387     List<Query> qlist = new ArrayList<Query>();
388     
389     // add the base query
390     if (!excludeSet.containsKey(rb.getQuery())) {
391       qlist.add(rb.getQuery());
392     }
393     
394     // add the filters
395     if (rb.getFilters() != null) {
396       for (Query q : rb.getFilters()) {
397         if (!excludeSet.containsKey(q)) {
398           qlist.add(q);
399         }
400       }
401     }
402     
403     // get the new base docset for this facet
404     return searcher.getDocSet(qlist);
405   }
406 
407   /**
408    * Computes the {@link StatsValues} for this {@link StatsField} relative to the 
409    * specified {@link DocSet} 
410    * @see #computeBaseDocSet
411    */
412   public StatsValues computeLocalStatsValues(DocSet base) throws IOException {
413 
414     if (statsToCalculate.isEmpty()) { 
415       // perf optimization for the case where we compute nothing
416       // ie: stats.field={!min=$domin}myfield&domin=false
417       return StatsValuesFactory.createStatsValues(this);
418     }
419 
420     if (null != schemaField 
421         && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) {
422 
423       // TODO: should this also be used for single-valued string fields? (should work fine)
424       return DocValuesStats.getCounts(searcher, this, base, facets);
425     } else {
426       // either a single valued field we pull from FieldCache, or an explicit
427       // function ValueSource
428       return computeLocalValueSourceStats(base);
429     }
430   }
431 
  /**
   * Accumulates stats (and any requested field facets) for every doc in the base 
   * DocSet by walking the docs in order and advancing through the index segments.
   */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    IndexSchema schema = searcher.getSchema();

    final StatsValues allstats = StatsValuesFactory.createStatsValues(this);

    List<FieldFacetStats> facetStats = new ArrayList<>();
    for( String facetField : facets ) {
      SchemaField fsf = schema.getField(facetField);

      if ( fsf.multiValued()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Stats can only facet on single-valued fields, not: " + facetField );
      }

      facetStats.add(new FieldFacetStats(searcher, fsf, this));
    }

    // docs come back in index order, so we only ever advance forward through the segments
    final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext ctx = null;
    for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // advance to the segment containing this doc
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;

        // propagate the context among accumulators.
        allstats.setNextReader(ctx);
        for (FieldFacetStats f : facetStats) {
          f.setNextReader(ctx);
        }
      }

      // accumulate, using the segment-local doc id
      allstats.accumulate(doc - ctx.docBase);
      for (FieldFacetStats f : facetStats) {
        f.facet(doc - ctx.docBase);
      }
    }

    for (FieldFacetStats f : facetStats) {
      allstats.addFacet(f.name, f.facetStatsValues);
    }
    return allstats;
  }
480 
481   /**
482    * The searcher that should be used for processing local stats
483    * @see SolrQueryRequest#getSearcher
484    */
485   public SolrIndexSearcher getSearcher() {
486     // see AbstractStatsValues.setNextReader
487 
488     return searcher;
489   }
490 
491   /**
492    * The {@link SchemaField} whose results these stats are computed over, may be null 
493    * if the stats are computed over the results of a function or query
494    *
495    * @see #getValueSource
496    */
497   public SchemaField getSchemaField() {
498     return schemaField;
499   }
500 
501   /**
502    * The {@link ValueSource} of a function or query whose results these stats are computed 
503    * over, may be null if the stats are directly over a {@link SchemaField}
504    *
505    * @see #getValueSource
506    */
507   public ValueSource getValueSource() {
508     return valueSource;
509   }
510 
511   public List<String> getTagList() {
512     return tagList;
513   }
514 
515   public String toString() {
516     return "StatsField<" + originalParam + ">";
517   }
518 
519   /**
520    * A helper method which inspects the {@link #localParams} associated with this StatsField, 
521    * and uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data 
522    * structures
523    */
524   private void populateStatsSets() {
525     boolean statSpecifiedByLocalParam = false;
526     // local individual stat
527     Iterator<String> itParams = localParams.getParameterNamesIterator();
528     
529     while (itParams.hasNext()) {
530       String paramKey = itParams.next();
531       Stat stat = Stat.forName(paramKey);
532       if (stat != null) {
533         statSpecifiedByLocalParam = true;
534         if (stat.parseParams(this)) {
535           statsInResponse.add(stat);
536         }
537       }
538     }
539 
540     // if no individual stat setting use the default set
541     if ( ! ( statSpecifiedByLocalParam
542              // calcdistinct (as a local param) is a psuedo-stat, prevents default set
543              || localParams.getBool("calcdistinct", false) ) ) {
544       statsInResponse.addAll(DEFAULT_STATS);
545     }
546 
547     // calcDistinct is a psuedo-stat with optional top level param default behavior
548     // if not overridden by the specific individual stats
549     if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
550       for (Stat stat : CALCDISTINCT_PSUEDO_STAT) {
551         // assume true, but don't include if specific stat overrides
552         if (localParams.getBool(stat.name(), true)) {
553           statsInResponse.add(stat);
554         }
555       }
556     }
557 
558     for (Stat stat : statsInResponse) {
559       statsToCalculate.addAll(stat.getDistribDeps());
560     }
561   }
562 
563   public boolean calculateStats(Stat stat) {
564     return statsToCalculate.contains(stat);
565   }
566   
567   public boolean includeInResponse(Stat stat) {
568     if (isShard) {
569       return statsToCalculate.contains(stat);
570     }
571    
572     if (statsInResponse.contains(stat)) {
573       return true;
574     }
575     return false;
576   }
577 
  /** The percentile cut-off points requested via the {@link Stat#percentiles} local param */
  public List<Double> getPercentilesList() {
    return percentilesList;
  }
  
  /** True if this request is a distributed shard sub-request (the "isShard" param) */
  public boolean getIsShard() {
    return isShard;
  }
  
  /** The t-digest compression used when computing percentiles (defaults to 100.0) */
  public double getTdigestCompression() {
    return tdigestCompression;
  }

  /** May be null if {@link Stat#cardinality} was not requested */
  public HllOptions getHllOptions() {
    return hllOpts;
  }
593 
594   /**
595    * Helper Struct for parsing and encapsulating all of the options relaed to building a {@link HLL}
596    *
597    * @see Stat#cardinality
598    * @lucene.internal
599    */
600   public static final class HllOptions {
601     final HashFunction hasher;
602     
603     // NOTE: this explanation linked to from the java-hll jdocs...
604     // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
605     // ..if i'm understanding the regwidth chart correctly, a value of 6 should be a enough
606     // to support any max cardinality given that we're always dealing with hashes and 
607     // the cardinality of the set of all long values is 2**64 == 1.9e19
608     //
609     // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect 
610     // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values 
611     // might fall in the same register (ie: bucket) and having a wider register to count more of 
612     // them may be useful
613 
614     final int log2m;  
615     final int regwidth;
616     
617     final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";
618 
619     private HllOptions(int log2m, int regwidth, HashFunction hasher) {
620       this.log2m = log2m;
621       this.regwidth = regwidth;
622       this.hasher = hasher;
623     }
624     /** 
625      * Creates an HllOptions based on the (local) params specified (if appropriate).
626      *
627      * @param localParams the LocalParams for this {@link StatsField}
628      * @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
629      * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
630      * @throws SolrException if there are invalid options
631      */
632     public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field) 
633       throws SolrException {
634 
635       String cardinalityOpt = localParams.get(Stat.cardinality.name());
636       if (StringUtils.isBlank(cardinalityOpt)) {
637         return null;
638       }
639 
640       final NumericType hashableNumType = getHashableNumericType(field);
641 
642       // some sane defaults
643       int log2m = 13;   // roughly equivilent to "cardinality='0.33'"
644       int regwidth = 6; // with decent hash, this is plenty for all valid long hashes
645 
646       if (NumericType.FLOAT.equals(hashableNumType) || NumericType.INT.equals(hashableNumType)) {
647         // for 32bit values, we can adjust our default regwidth down a bit
648         regwidth--;
649 
650         // NOTE: EnumField uses NumericType.INT, and in theory we could be super conservative
651         // with it, but there's no point - just let the EXPLICIT HLL handle it
652       }
653 
654       // TODO: we could attempt additional reductions in the default regwidth based on index
655       // statistics -- but thta doesn't seem worth the effort.  for tiny indexes, the 
656       // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't 
657       // want to be too aggresive about lowering regwidth or we could really poor results if 
658       // log2m is also low and  there is heavy hashkey collision
659 
660       try {
661         // NFE will short out here if it's not a number
662         final double accuracyOpt = Double.parseDouble(cardinalityOpt);
663 
664         // if a float between 0 and 1 is specified, treat it as a prefrence of accuracy
665         // - 0 means accuracy is not a concern, save RAM
666         // - 1 means be as accurate as possible, using as much RAM as needed.
667 
668         if (accuracyOpt < 0D || 1.0D < accuracyOpt) {
669           throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
670         }
671 
672         // use accuracyOpt as a scaling factor between min & max legal log2m values
673         log2m = HLL.MINIMUM_LOG2M_PARAM
674           + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));
675 
676         // use accuracyOpt as a scaling factor for regwidth as well, BUT...
677         // be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful
678         // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
679         final int MIN_HUERISTIC_REGWIDTH = regwidth-1;
680         regwidth = MIN_HUERISTIC_REGWIDTH
681           + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH));
682 
683       } catch (NumberFormatException nfe) {
684         // param value isn't a number -- let's check for simple true/false
685         if (! localParams.getBool(Stat.cardinality.name(), false)) {
686           return null;
687         }
688       }
689 
690       // let explicit params override both the default and/or any accuracy specification
691       log2m = localParams.getInt("hllLog2m", log2m);
692       regwidth = localParams.getInt("hllRegwidth", regwidth);
693 
694       // validate legal values
695       if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
696         throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least " + 
697                                 HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM
698                                 + " (" + log2m +")");
699       }
700       if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
701         throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least " + 
702                                 HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM);
703       }
704       
705       HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();
706 
707       if (null == hasher) {
708         // if this is a function, or a non Long field, pre-hashed is invalid
709         // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
710         if (null == field || !NumericType.LONG.equals(field.getType().getNumericType())) {
711           throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
712         }
713       }
714 
715       // if we're still here, then we need an HLL...
716       return new HllOptions(log2m, regwidth, hasher);
717     }
718     /** @see HLL */
719     public int getLog2m() {
720       return log2m;
721     }
722     /** @see HLL */
723     public int getRegwidth() {
724       return regwidth;
725     }
726     /** May be null if user has indicated that field values are pre-hashed */
727     public HashFunction getHasher() {
728       return hasher;
729     }
730     public HLL newHLL() {
731       // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have
732       // some nasty impacts on response time as it gets larger - particularly in distrib requests.
733       // Merging large SPARSE HLLs is much much slower then merging FULL HLLs with the same num docs
734       //
735       // TODO: add more tunning options for this.
736       return new HLL(getLog2m(), getRegwidth(), -1 /* auto explict threshold */,
737                      false /* no sparse representation */, HLLType.EMPTY);
738                      
739     }
740   }
741 
742   /**
743    * Returns the effective {@link NumericType} for the field for the purposes of hash values.  
744    * ie: If the field has an explict NumericType that is returned; If the field has no explicit 
745    * NumericType then {@link NumericType#LONG} is returned;  If field is null, then 
746    * {@link NumericType#FLOAT} is assumed for ValueSource.
747    */
748   private static NumericType getHashableNumericType(SchemaField field) {
749     if (null == field) {
750       return NumericType.FLOAT;
751     }
752     final NumericType result = field.getType().getNumericType();
753     return null == result ? NumericType.LONG : result;
754   }
755 }